{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "import gym\n",
    "import random\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import time\n",
    "from gym.envs.registration import register\n",
    "from IPython.display import clear_output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Observation space: Discrete(16)\n",
      "Action space: Discrete(4)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "gym.spaces.discrete.Discrete"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py\n",
    "# https://github.com/openai/gym/blob/master/gym/envs/__init__.py\n",
    "try:\n",
    "    register(\n",
    "        id='FrozenLakeNoSlip-v0',\n",
    "        entry_point='gym.envs.toy_text:FrozenLakeEnv',\n",
    "        kwargs={'map_name' : '4x4', 'is_slippery':False},\n",
    "        max_episode_steps=100,\n",
    "        reward_threshold=0.78, # optimum = .8196\n",
    "    )\n",
    "except:\n",
    "    pass\n",
    "\n",
    "# env_name = \"CartPole-v1\"\n",
    "# env_name = \"MountainCar-v0\"\n",
    "# env_name = \"MountainCarContinuous-v0\"\n",
    "# env_name = \"Acrobot-v1\"\n",
    "# env_name = \"Pendulum-v0\"\n",
    "env_name = \"FrozenLake-v0\"\n",
    "env_name = \"FrozenLakeNoSlip-v0\"\n",
    "env = gym.make(env_name)\n",
    "print(\"Observation space:\", env.observation_space)\n",
    "print(\"Action space:\", env.action_space)\n",
    "type(env.action_space)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Agent():\n",
    "    def __init__(self, env):\n",
    "        self.is_discrete = \\\n",
    "            type(env.action_space) == gym.spaces.discrete.Discrete\n",
    "        \n",
    "        if self.is_discrete:\n",
    "            self.action_size = env.action_space.n\n",
    "            print(\"Action size:\", self.action_size)\n",
    "        else:\n",
    "            self.action_low = env.action_space.low\n",
    "            self.action_high = env.action_space.high\n",
    "            self.action_shape = env.action_space.shape\n",
    "            print(\"Action range:\", self.action_low, self.action_high)\n",
    "        \n",
    "    def get_action(self, state):\n",
    "        if self.is_discrete:\n",
    "            action = random.choice(range(self.action_size))\n",
    "        else:\n",
    "            action = np.random.uniform(self.action_low,\n",
    "                                       self.action_high,\n",
    "                                       self.action_shape)\n",
    "        return action"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Action size: 4\n",
      "State size: 16\n"
     ]
    }
   ],
   "source": [
    "class QNAgent(Agent):\n",
    "    def __init__(self, env, discount_rate=0.97, learning_rate=0.01):\n",
    "        super().__init__(env)\n",
    "        self.state_size = env.observation_space.n\n",
    "        print(\"State size:\", self.state_size)\n",
    "        \n",
    "        self.eps = 1.0\n",
    "        self.discount_rate = discount_rate\n",
    "        self.learning_rate = learning_rate\n",
    "        self.build_model()\n",
    "        \n",
    "        self.sess = tf.Session()\n",
    "        self.sess.run(tf.global_variables_initializer())\n",
    "        \n",
    "    def build_model(self):\n",
    "        tf.reset_default_graph()\n",
    "        self.state_in = tf.placeholder(tf.int32, shape=[1])\n",
    "        self.action_in = tf.placeholder(tf.int32, shape=[1])\n",
    "        self.target_in = tf.placeholder(tf.float32, shape=[1])\n",
    "        \n",
    "        self.state = tf.one_hot(self.state_in, depth=self.state_size)\n",
    "        self.action = tf.one_hot(self.action_in, depth=self.action_size)\n",
    "        \n",
    "        self.q_state = tf.layers.dense(self.state, units=self.action_size, name=\"q_table\")\n",
    "        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)\n",
    "        \n",
    "        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))\n",
    "        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)\n",
    "        \n",
    "    def get_action(self, state):\n",
    "        q_state = self.sess.run(self.q_state, feed_dict={self.state_in: [state]})\n",
    "        action_greedy = np.argmax(q_state)\n",
    "        action_random = super().get_action(state)\n",
    "        return action_random if random.random() < self.eps else action_greedy\n",
    "    \n",
    "    def train(self, experience):\n",
    "        state, action, next_state, reward, done = ([exp] for exp in experience)\n",
    "        \n",
    "        q_next = self.sess.run(self.q_state, feed_dict={self.state_in: next_state})\n",
    "        q_next[done] = np.zeros([self.action_size])\n",
    "        q_target = reward + self.discount_rate * np.max(q_next)\n",
    "        \n",
    "        feed = {self.state_in: state, self.action_in: action, self.target_in: q_target}\n",
    "        self.sess.run(self.optimizer, feed_dict=feed)\n",
    "        \n",
    "        if experience[4]:\n",
    "            self.eps = self.eps * 0.99\n",
    "            \n",
    "    def __del__(self):\n",
    "        self.sess.close()\n",
    "        \n",
    "agent = QNAgent(env)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "s: 15 a: 2\n",
      "Episode: 99, Total reward: 71.0, eps: 0.13397967485796175\n",
      "  (Right)\n",
      "SFFF\n",
      "FHFH\n",
      "FFFH\n",
      "HFF\u001b[41mG\u001b[0m\n",
      "[[ 0.26639792  0.23388803  0.2838942   0.17449419]\n",
      " [ 0.27005327 -0.4024756   0.1345939  -0.03563018]\n",
      " [ 0.00683864  0.11768234  0.1508714  -0.0175943 ]\n",
      " [ 0.1645656  -0.4792729   0.1205796  -0.1818824 ]\n",
      " [ 0.24408905  0.2570015  -0.39948407  0.18414727]\n",
      " [-0.51439786 -0.14780176  0.41053963 -0.07503477]\n",
      " [-0.4445371   0.1159418  -0.51110405 -0.03309001]\n",
      " [-0.20044279 -0.25773728  0.26571983 -0.49883124]\n",
      " [ 0.31056625 -0.52676606  0.41068304  0.20587952]\n",
      " [ 0.33791235  0.3192833   0.41197717 -0.54344785]\n",
      " [ 0.3656563   0.13440336 -0.32820502 -0.02268323]\n",
      " [ 0.19982225 -0.49265525  0.4245258   0.05591565]\n",
      " [ 0.14388543  0.22361499 -0.33384812  0.517786  ]\n",
      " [-0.30385983  0.2902795   0.47450408  0.2592849 ]\n",
      " [ 0.13970053  0.21021995  0.5043464   0.2025718 ]\n",
      " [ 0.15857393  0.19134241  0.45319474 -0.47139642]]\n"
     ]
    }
   ],
   "source": [
    "total_reward = 0\n",
    "for ep in range(100):\n",
    "    state = env.reset()\n",
    "    done = False\n",
    "    while not done:\n",
    "        action = agent.get_action(state)\n",
    "        next_state, reward, done, info = env.step(action)\n",
    "        agent.train((state,action,next_state,reward,done))\n",
    "        state = next_state\n",
    "        total_reward += reward\n",
    "        \n",
    "        print(\"s:\", state, \"a:\", action)\n",
    "        print(\"Episode: {}, Total reward: {}, eps: {}\".format(ep,total_reward,agent.eps))\n",
    "        env.render()\n",
    "        with tf.variable_scope(\"q_table\", reuse=True):\n",
    "            weights = agent.sess.run(tf.get_variable(\"kernel\"))\n",
    "            print(weights)\n",
    "        time.sleep(0.05)\n",
    "        clear_output(wait=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
